from bs4 import BeautifulSoup
import requests
from pymongo import MongoClient
import time

# MongoDB handles shared by the whole module: a client for the server on
# localhost:27017, the 'cookbook' database, and its 'cooks2' collection.
client = MongoClient('mongodb://localhost:27017/')
db = client['cookbook']
collection = db['cooks2']


def get_page(url, timeout=10):
    """Download *url* with an HTTP GET and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout unless one is passed, so a stalled connection
            would otherwise block the caller indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    response = requests.get(url=url, timeout=timeout)
    return response.text


def get_page_links(text):
    """Yield the link targets found in the HTML of a listing page.

    Args:
        text: HTML source to parse.

    Yields:
        The ``href`` value of each anchor matched by the selector
        ``#listtyle1_list > div.listtyle1 > a``. Anchors that have no
        ``href`` (or an empty one) are skipped so callers never receive
        ``None`` in place of a URL.
    """
    soup = BeautifulSoup(text, 'lxml')
    for anchor in soup.select('#listtyle1_list > div.listtyle1 > a'):
        href = anchor.get('href')
        if href:
            yield href


def _first_text(nodes, default="none"):
    """Return the text of the first node in *nodes*, or *default* if empty."""
    return nodes[0].text if nodes else default


def _first_attr(nodes, attr, default="none"):
    """Return attribute *attr* of the first node in *nodes*, or *default* if empty."""
    return nodes[0].get(attr) if nodes else default


def get_details(link, timeout=10):
    """Scrape one detail page and insert the extracted item into MongoDB.

    The page at *link* is downloaded, parsed with BeautifulSoup and reduced to
    a dict that is stored through the module-level ``collection``.

    Single-value fields (``pic_url``, ``title``, ``user_info``, ``date_info``,
    ``ready_time``, ``cost_time``) fall back to the string ``"none"`` when the
    page has no matching element, so one incomplete page does not raise
    ``IndexError``. List fields are simply empty in that case.

    Args:
        link: URL of the detail page.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.exceptions.RequestException: On connection failure or when
            the timeout expires.
    """
    detail_page = requests.get(url=link, timeout=timeout)
    soup = BeautifulSoup(detail_page.text, 'lxml')

    # Shared selector prefixes; the concatenations below produce exactly the
    # same selector strings the page layout requires.
    main = 'body > div.main_w.clearfix > div.main.clearfix'
    header = main + ' > div.cp_header.clearfix'
    info = header + ' > div.cp_main_info_w > div.info3 > div > div'
    body_left = main + ' > div.cp_body.clearfix > div.cp_body_left'
    material_items = (
        body_left + ' > div.materials > div > div.yl.zl.clearfix > ul > li')

    pic_url = soup.select(header + ' > div.cp_headerimg_w > img')
    title = soup.select('#tongji_title')
    tags = soup.select('dl.yj_tags.clearfix > dt')
    user_info = soup.select(info + ' > span')
    date_info = soup.select(info + ' > strong')
    steps = soup.select(
        body_left + ' > div.measure > div.editnew.edit > div > div > p')
    ready_time = soup.select('#tongji_zbsj')
    cost_time = soup.select('#tongji_prsj')
    material_pics = soup.select(material_items + ' > a > img')
    materials = soup.select(material_items + ' > div > h4 > a')

    item = {
        'pic_url': _first_attr(pic_url, 'src'),
        'title': _first_text(title),
        'tags': [tag.text for tag in tags],
        'user_info': _first_text(user_info),
        'date_info': _first_text(date_info),
        # Paragraphs with text are step descriptions; paragraphs wrapping an
        # <img> contribute that image's URL instead.
        'steps': [step.text for step in steps if step.text],
        'step_urls': [step.img.get('src') for step in steps if step.img],
        'ready_time': _first_text(ready_time),
        'cost_time': _first_text(cost_time),
        'materials': [material.text for material in materials],
        'material_pics': [pic.get('src') for pic in material_pics],
    }

    collection.insert_one(item)
    print('one complete')


if __name__ == '__main__':
    # Walk listing pages 0..49 and store every detail page they link to.
    # NOTE(review): pagination starts at page=0 — confirm the site's first
    # page index.
    base_url = 'http://www.meishij.net/list.php?lm=13&page='
    for i in range(50):
        # Throttle: wait one second before each listing-page request.
        time.sleep(1)
        url = base_url + str(i)
        try:
            text = get_page(url=url)
        except requests.RequestException as err:
            # A single unreachable listing page must not abort the crawl.
            print('skip page {}: {}'.format(url, err))
            continue
        for link in get_page_links(text):
            try:
                get_details(link)
            except (requests.RequestException, IndexError) as err:
                # Network failure or a page missing expected elements:
                # report it and move on. Database errors still propagate.
                print('skip {}: {}'.format(link, err))
